Main Figures
Have a legend for all plots
Since we have set a parameter in our set_colors.R file that we source at the beginning and use this throughout, we can assure that the below legends will represent the legends called afterwards.
####### Legend for lakesite
legend_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = season_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank())
# Extract the legend
season_legend <- get_legend(legend_plot)
####### Legend for lakesite
lakesite_legend <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = lakesite)) +
geom_jitter(size = 3, width = 0.2) +
scale_fill_manual(values = fraction_colors, guide = guide_legend(override.aes = list(shape = 22, size = 4))) +
scale_shape_manual(values = lakesite_shapes, guide = guide_legend(override.aes = list(size = 4))) +
theme(legend.position = "bottom", axis.title.x = element_blank(),
legend.title = element_blank())
# Extract the legend
lakesite_legend <- get_legend(lakesite_legend)
Figure 1
######################################################### Fraction ABUNDANCe
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox
##
## Wilcoxon rank sum test
##
## data: log10(as.numeric(fraction_bac_abund)) by fraction
## W = 0, p-value = 1.479e-06
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(as.numeric(fraction_bac_abund)))
## # A tibble: 2 x 2
## fraction `mean(as.numeric(fraction_bac_abund))`
## <fct> <dbl>
## 1 Particle 41169.
## 2 Free 734522.
# Make a data frame to draw significance line in boxplot (visually calculated)
dat1 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(6.45,6.5,6.5,6.45)) # WholePart vs WholeFree
poster_a <-
ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
ylab("Log10(Bacterial Counts) \n (cells/mL)") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
##### Particle vs free cell abundances
geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=6.55, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=6.4, size = 3.5, color = "gray40",
label= paste("p =", round(frac_abund_wilcox$p.value, digits = 6))) +
theme(legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox
##
## Wilcoxon rank sum test
##
## data: frac_bacprod by fraction
## W = 33, p-value = 0.02418
## alternative hypothesis: true location shift is not equal to 0
metadata %>%
group_by(fraction) %>%
summarize(mean(frac_bacprod))
## # A tibble: 2 x 2
## fraction `mean(frac_bacprod)`
## <fct> <dbl>
## 1 Particle 9.96
## 2 Free 24.1
# Make a data frame to draw significance line in boxplot (visually calculated)
dat2 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(71,72,72,71)) # WholePart vs WholeFree
poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(0, 78), expand = c(0,0), breaks = seq(0, 75, by = 15)) +
ylab("Community Production \n (μgC/L/day)") +
##### Particle vs free bulk production
geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=73, size = 8, color = "gray40", label= paste("*")) +
annotate("text", x=1.5, y=69, size = 3.5, color = "gray40",
label= paste("p =", round(totprod_wilcox$p.value, digits = 3))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######################################################### TOTAL PRODUCTION
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox
##
## Wilcoxon rank sum test
##
## data: log10(fracprod_per_cell) by fraction
## W = 125, p-value = 6.656e-05
## alternative hypothesis: true location shift is not equal to 0
filter(metadata, norep_filter_name != "MOTEJ515") %>%
group_by(fraction) %>%
summarize(mean(fracprod_per_cell))
## # A tibble: 2 x 2
## fraction `mean(fracprod_per_cell)`
## <fct> <dbl>
## 1 Particle 0.000000482
## 2 Free 0.0000000387
# Make a data frame to draw significance line in boxplot (visually calculated)
dat3 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(-5.05,-5,-5,-5.05)) # WholePart vs WholeFree
poster_c <- ggplot(filter(metadata, norep_filter_name != "MOTEJ515"),
aes(y = log10(fracprod_per_cell), x = fraction)) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylim(c(-8.5, -4.9)) +
ylab("Log10(Per-Capita Production) \n(μgC/cell/day)") +
##### Particle vs free per-cell production
geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
annotate("text", x=1.5, y=-4.95, size = 8, color = "gray40", label= paste("***")) +
annotate("text", x=1.5, y=-5.15, size = 3.5, color = "gray40",
label= paste("p =", round(percellprod_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",
axis.title.x = element_blank())
######## FIGURE 1
# legend
legend <- get_legend(poster_a)
row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
labels = c("A", "B", "C"),
ncol = 3, nrow = 1)
#fig_1 <-
plot_grid(row1_plots, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Linear Regressions with Fraction Diversity
# Free-Living samples only
div_measures_free <- otu_alphadiv %>%
dplyr::select(fraction, mean, measure, frac_bacprod, fracprod_per_cell_noinf) %>%
filter(fraction == "Free") %>%
dplyr::select(-fraction)
# Particle-associated samples only
div_measures_part <- otu_alphadiv %>%
dplyr::select(fraction, mean, measure, frac_bacprod, fracprod_per_cell_noinf) %>%
filter(fraction == "Particle") %>%
dplyr::select(-fraction)
## All of the samples!
div_measures_all <- otu_alphadiv %>%
dplyr::select(mean, measure, frac_bacprod, fracprod_per_cell_noinf)
########################################################
############## Particle: Community-wide
lm_divs_particle_comm <- div_measures_part %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(measure) %>%
do(glance(lm(frac_bacprod ~ mean, data = .))) %>%
mutate(fraction = "Particle", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## Particle: Community-wide
lm_divs_particle_percap <- div_measures_part %>%
dplyr::select(-frac_bacprod) %>%
group_by(measure) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>%
mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
########################################################
############## Free: Community-wide
lm_divs_free_comm <- div_measures_free %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(measure) %>%
do(glance(lm(frac_bacprod ~ mean, data = .))) %>%
mutate(fraction = "Free", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## Free: Community-wide
lm_divs_free_percap <- div_measures_free %>%
dplyr::select(-frac_bacprod) %>%
group_by(measure) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>%
mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
########################################################
############## All Samples: Community-wide
lm_divs_all_comm <- div_measures_all %>%
dplyr::select(-fracprod_per_cell_noinf) %>%
group_by(measure) %>%
do(glance(lm(frac_bacprod ~ mean, data = .))) %>%
mutate(fraction = "All Samples", dependent = "Community Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
############## All Samples: Community-wide
lm_divs_all_percap <- div_measures_all %>%
dplyr::select(-frac_bacprod) %>%
group_by(measure) %>%
do(glance(lm(log10(fracprod_per_cell_noinf) ~ mean, data = .))) %>%
mutate(fraction = "All Samples", dependent = "Per-Capita Production") %>%
dplyr::select(fraction, dependent, measure, AIC, adj.r.squared, p.value) %>%
ungroup() %>%
mutate(FDR.p = p.adjust(p.value, method = "fdr"))
## Filtered PCA Linear Model Results
# Put all of the PCA and environmental linear model results together into one dataframe
lm_div_results <-
bind_rows(lm_divs_particle_comm, lm_divs_particle_percap,
lm_divs_free_comm, lm_divs_free_percap,
lm_divs_all_comm, lm_divs_all_percap) %>%
dplyr::rename(independent_var = measure,
dependent_var = dependent)
See supplemental table 1 header for the table output!
Cross Validation
Cross validation will provide a better estimate for the adjusted R-squared value of our diversity regressions.
#################################### Community-Wide Production ####################################
################### Richness ###################
lm_prod_vs_rich_PA <- lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1271 -2.8865 -0.4649 3.7537 9.8401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.269531 5.717450 -1.971 0.07701 .
## mean 0.038125 0.009868 3.863 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.476 on 10 degrees of freedom
## Multiple R-squared: 0.5988, Adjusted R-squared: 0.5587
## F-statistic: 14.93 on 1 and 10 DF, p-value: 0.003143
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Free")))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.094 -11.112 -2.755 8.611 33.308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.40793 21.62006 0.250 0.808
## mean 0.05511 0.06207 0.888 0.395
##
## Residual standard error: 17.7 on 10 degrees of freedom
## Multiple R-squared: 0.07306, Adjusted R-squared: -0.01963
## F-statistic: 0.7882 on 1 and 10 DF, p-value: 0.3955
# Without the 2 high samples
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle" & mean < 700)))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Particle" & mean < 700))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7578 -1.6875 -0.6392 2.8807 6.3336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.44152 8.50828 -0.640 0.540
## mean 0.02559 0.01701 1.505 0.171
##
## Residual standard error: 4.36 on 8 degrees of freedom
## Multiple R-squared: 0.2205, Adjusted R-squared: 0.1231
## F-statistic: 2.264 on 1 and 8 DF, p-value: 0.1709
rich_test_df <-
richness %>%
mutate(frac_boolean = (fraction == "Particle")*1)
# Correlation of richness between PA and FL
rich_test_df_sorted <- arrange(rich_test_df, norep_water_name)
rich_test_df_sorted_PA <- filter(rich_test_df_sorted, fraction == "Particle")
rich_test_df_sorted_FL <- filter(rich_test_df_sorted, fraction == "Free")
cor.test(rich_test_df_sorted_PA$mean, rich_test_df_sorted_FL$mean)
##
## Pearson's product-moment correlation
##
## data: rich_test_df_sorted_PA$mean and rich_test_df_sorted_FL$mean
## t = 0.79099, df = 10, p-value = 0.4473
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3848365 0.7167446
## sample estimates:
## cor
## 0.2426582
# Look into car::vif() for multicollinearity
cor(rich_test_df$mean, rich_test_df$frac_boolean)
## [1] 0.6508802
summary(lm(frac_bacprod ~ mean + frac_boolean, data = rich_test_df))
##
## Call:
## lm(formula = frac_bacprod ~ mean + frac_boolean, data = rich_test_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.525 -5.808 -1.127 5.678 34.828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.95501 7.87833 1.264 0.22022
## mean 0.04167 0.02055 2.028 0.05545 .
## frac_boolean -23.20045 6.89481 -3.365 0.00293 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.82 on 21 degrees of freedom
## Multiple R-squared: 0.3512, Adjusted R-squared: 0.2894
## F-statistic: 5.684 on 2 and 21 DF, p-value: 0.01064
PA_rich_residplot <- plot_residuals(lm_prod_vs_rich_PA, filter(richness, fraction == "Particle")$frac_bacprod, "Community Production vs Particle-Associated Richness")
## [1] "There are no high leverage points in this model."
# Cross validate particle-associated for a better estimate of the adjusted R-squared
cv_lm_prod_vs_rich_PA <- train(
frac_bacprod ~ mean, data = filter(richness, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_rich_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.878977 0.6666884 5.5294 2.263403 0.2802459 1.908415
################### Shannon Entropy ###################
lm_prod_vs_shannon_PA <- lm(frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"))
cv_lm_prod_vs_shannon_PA <- train(
frac_bacprod ~ mean, data = filter(shannon, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_shannon_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.620109 0.6805392 5.226135 2.393256 0.285612 2.069473
################### Inverse Simpson ###################
lm_prod_vs_invsimps_PA <- lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_prod_vs_invsimps_PA)
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5589 -2.1093 -0.1969 0.8752 7.8282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.45199 2.43866 -0.185 0.85666
## mean 0.29344 0.05781 5.076 0.00048 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.571 on 10 degrees of freedom
## Multiple R-squared: 0.7204, Adjusted R-squared: 0.6925
## F-statistic: 25.77 on 1 and 10 DF, p-value: 0.0004804
# Without the 2 high samples
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70)))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction ==
## "Particle" & mean < 70))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3309 -1.6323 -0.3605 1.4988 6.5349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.61837 2.49960 0.647 0.5355
## mean 0.20554 0.08156 2.520 0.0358 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.688 on 8 degrees of freedom
## Multiple R-squared: 0.4425, Adjusted R-squared: 0.3729
## F-statistic: 6.351 on 1 and 8 DF, p-value: 0.03581
PA_invsimps_residplot <- plot_residuals(lm_prod_vs_invsimps_PA, filter(invsimps, fraction == "Particle")$frac_bacprod, "Community Production vs Particle-Associated Inverse Simpson")
## [1] "There are no high leverage points in this model."
invsimps_test_df <-
invsimps %>%
mutate(frac_boolean = (fraction == "Particle")*1)
cor(invsimps_test_df$mean, invsimps_test_df$frac_boolean)
## [1] 0.3165699
summary(lm(frac_bacprod ~ mean + frac_boolean, data = invsimps_test_df))
##
## Call:
## lm(formula = frac_bacprod ~ mean + frac_boolean, data = invsimps_test_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.355 -5.937 -0.176 2.676 37.452
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.3742 5.2053 3.146 0.00488 **
## mean 0.3189 0.1527 2.089 0.04907 *
## frac_boolean -17.7310 5.4908 -3.229 0.00402 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.76 on 21 degrees of freedom
## Multiple R-squared: 0.3576, Adjusted R-squared: 0.2965
## F-statistic: 5.846 on 2 and 21 DF, p-value: 0.009586
# Correlation of inverse Simpson's index between PA and FL
invsimps_test_df_sorted <- arrange(invsimps_test_df, norep_water_name)
invsimps_test_df_sorted_PA <- filter(invsimps_test_df_sorted, fraction == "Particle")
invsimps_test_df_sorted_FL <- filter(invsimps_test_df_sorted, fraction == "Free")
cor.test(invsimps_test_df_sorted_PA$mean, invsimps_test_df_sorted_FL$mean)
##
## Pearson's product-moment correlation
##
## data: invsimps_test_df_sorted_PA$mean and invsimps_test_df_sorted_FL$mean
## t = 1.7, df = 10, p-value = 0.12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1378559 0.8235989
## sample estimates:
## cor
## 0.4735076
cv_lm_prod_vs_invsimps_PA <- train(
frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_invsimps_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 5.325326 0.7605251 4.051095 2.131925 0.2539022 1.816429
################### Simpson's Evenness ###################
lm_prod_vs_simpseven_PA <- lm(frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"))
cv_lm_prod_vs_simpseven_PA <- train(
frac_bacprod ~ mean, data = filter(simpseven, fraction == "Particle"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_prod_vs_simpseven_PA$results # Particle Community-Wide CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 6.269156 0.6398939 4.852454 3.012038 0.324493 2.312427
#################################### Per-Capita Production ####################################
################### Richness ###################
lm_percell_prod_vs_rich_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle"))
summary(lm_percell_prod_vs_rich_PA)
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55347 -0.21545 -0.01066 0.12536 0.58830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.0617648 0.3717671 -21.685 4.44e-09 ***
## mean 0.0023794 0.0006348 3.748 0.00457 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3506 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6095, Adjusted R-squared: 0.5662
## F-statistic: 14.05 on 1 and 9 DF, p-value: 0.004567
# Dummy variable regression
summary(lm(log10(fracprod_per_cell_noinf) ~ mean + frac_boolean,
data = rich_test_df))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean + frac_boolean,
## data = rich_test_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74137 -0.19795 -0.01396 0.24432 0.63068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.3288180 0.2234148 -37.280 < 2e-16 ***
## mean 0.0022273 0.0005832 3.819 0.00107 **
## frac_boolean 0.3524489 0.1995645 1.766 0.09264 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3625 on 20 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6973, Adjusted R-squared: 0.667
## F-statistic: 23.04 on 2 and 20 DF, p-value: 6.457e-06
PA_rich_percell_residplot <- plot_residuals(lm_percell_prod_vs_rich_PA, filter(richness, fraction == "Particle")$lm(log10(fracprod_per_cell_noinf)), "Log10(Cellular Production) vs Particle-Associated Richness")
## [1] "There are no high leverage points in this model."
## Error in xy.coords(x, y, xlabel, ylabel, log): attempt to apply non-function
cv_lm_percell_prod_vs_rich_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_rich_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4135282 0.6367477 0.3318983 0.1426534 0.327894 0.1262711
################### Shannon Entropy ###################
lm_percell_prod_vs_shannon_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle"))
cv_lm_percell_prod_vs_shannon_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(shannon, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_shannon_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4070696 0.7054441 0.3276488 0.1433208 0.2981486 0.1251932
################### Inverse Simpson ###################
lm_percell_prod_vs_invsimps_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle"))
summary(lm_percell_prod_vs_invsimps_PA)
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28651 -0.18384 -0.11125 0.07337 0.56444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.360961 0.159490 -46.153 5.27e-12 ***
## mean 0.018087 0.003759 4.812 0.000958 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2969 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7201, Adjusted R-squared: 0.689
## F-statistic: 23.15 on 1 and 9 DF, p-value: 0.0009581
# Dummy variable regression
summary(lm(log10(fracprod_per_cell_noinf) ~ mean + frac_boolean, data = invsimps_test_df))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean + frac_boolean,
## data = invsimps_test_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68100 -0.16952 0.00247 0.13168 0.63767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.01336 0.13664 -58.646 < 2e-16 ***
## mean 0.01819 0.00401 4.536 0.000201 ***
## frac_boolean 0.64869 0.14654 4.427 0.000259 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3347 on 20 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.742, Adjusted R-squared: 0.7162
## F-statistic: 28.76 on 2 and 20 DF, p-value: 1.305e-06
PA_invsimps_percell_residplot <- plot_residuals(lm_percell_prod_vs_invsimps_PA, filter(invsimps, fraction == "Particle")$lm(log10(fracprod_per_cell_noinf)), "Log10(Cellular Production) vs Particle-Associated Inverse Simpson")
## [1] "There are no high leverage points in this model."
## Error in xy.coords(x, y, xlabel, ylabel, log): attempt to apply non-function
cv_lm_percell_prod_vs_invsimps_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_invsimps_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.3526888 0.7214097 0.2865713 0.1140385 0.3265452 0.0919178
################### Simpson's Evenness ###################
lm_percell_prod_vs_simpseven_PA <- lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle"))
cv_lm_percell_prod_vs_simpseven_PA <- train(
log10(fracprod_per_cell_noinf) ~ mean, data = filter(simpseven, fraction == "Particle" & norep_filter_name != "MOTEJ515"),
method ='lm',
trControl = trainControl(method ="repeatedcv", number = 3, repeats = 100),
tuneGrid = expand.grid(intercept = TRUE))
cv_lm_percell_prod_vs_simpseven_PA$results # Particle Per-Capita CV results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.4368212 0.6842855 0.3526242 0.1383327 0.2979329 0.1103317
Diversity distributions
rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree
rich_vert_distribution_plot <-
ggplot(richness, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
ylab("Observed Richness") + xlab("Fraction") +
scale_shape_manual(values = season_shapes) +
geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=980, size = 8, color = "#424645",
label= paste("***")) +
annotate("text", x=1.5, y=880, size = 4, color = "#424645",
label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 5))) +
theme(legend.position = "none", axis.title.x = element_blank())
######################################################### INVERSE SIMPSON #########################################################
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree
invsimps_vert_distribution_plot <-
ggplot(invsimps, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
ylab("Inverse Simpson") + xlab("Fraction") +
geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
theme(legend.position = "none", axis.title.x = element_blank())
plot_grid(rich_vert_distribution_plot, invsimps_vert_distribution_plot,
ncol = 2, nrow = 1, labels = c("A", "B"))
ggsave("div.png", width = 6, height = 3.5)
Figure 2
######################################################### OBSERVED RICHNESS #########################################################
rich_fraction_wilcox <- wilcox.test(mean ~ fraction, data = richness)
rich_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: mean by fraction
## W = 129, p-value = 0.0004955
## alternative hypothesis: true location shift is not equal to 0
filter(richness) %>%
group_by(fraction) %>%
summarize(mean(mean), sd(mean), min(mean), max(mean))
## # A tibble: 2 x 5
## fraction `mean(mean)` `sd(mean)` `min(mean)` `max(mean)`
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Particle 557. 167. 343. 906.
## 2 Free 338. 86.0 238. 511.
# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(920, 930, 930, 920)) # WholePart vs WholeFree
rich_distribution_plot <-
ggplot(richness, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
xlab("Observed Richness") + xlab("Fraction") +
scale_shape_manual(values = season_shapes) +
geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=980, size = 8, color = "#424645", label= paste("***"), angle = 90) +
annotate("text", x=1.5, y=790, size = 4, color = "#424645",
label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 5))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Richness vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_rich_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Community-wide (Per-Liter) Production
prod_vs_rich_plot <-
ggplot(richness, aes(x=mean, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("\n Community Production \n (μgC/L/day)") +
xlab("Observed Richness") +
geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
# Add the R2 and p-value to the plot
annotate("text", x=790, y=45, label=lm_lab_perliter_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank())
# Summary stats
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1271 -2.8865 -0.4649 3.7537 9.8401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.269531 5.717450 -1.971 0.07701 .
## mean 0.038125 0.009868 3.863 0.00314 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.476 on 10 degrees of freedom
## Multiple R-squared: 0.5988, Adjusted R-squared: 0.5587
## F-statistic: 14.93 on 1 and 10 DF, p-value: 0.003143
# WITHOUT THE TWO HIGH POINTS
summary(lm(frac_bacprod ~ mean, data = filter(richness, fraction == "Particle" & mean < 800)))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(richness, fraction ==
## "Particle" & mean < 800))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7578 -1.6875 -0.6392 2.8807 6.3336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.44152 8.50828 -0.640 0.540
## mean 0.02559 0.01701 1.505 0.171
##
## Residual standard error: 4.36 on 8 degrees of freedom
## Multiple R-squared: 0.2205, Adjusted R-squared: 0.1231
## F-statistic: 2.264 on 1 and 8 DF, p-value: 0.1709
################ Richness vs Per Capita (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_rich_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_rich_PA)$coefficients[,4][2]), digits = 3), ")")
## 2. Plot Richness vs Per Capita (Per-Cell) Production
percell_prod_vs_rich_plot <-
ggplot(richness, aes(x=mean, y=log10(fracprod_per_cell_noinf))) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
xlab("Observed Richness") +
geom_smooth(data=filter(richness, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(150,980), breaks = seq(from = 0, to =925, by = 150)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=790, y=-7.5, label=lm_lab_percell_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
# Summary stats
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55347 -0.21545 -0.01066 0.12536 0.58830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.0617648 0.3717671 -21.685 4.44e-09 ***
## mean 0.0023794 0.0006348 3.748 0.00457 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3506 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6095, Adjusted R-squared: 0.5662
## F-statistic: 14.05 on 1 and 9 DF, p-value: 0.004567
# WITHOUT THE TWO HIGH POINTS
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness, fraction == "Particle" & mean < 800)))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(richness,
## fraction == "Particle" & mean < 800))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39374 -0.18584 -0.00611 0.16713 0.42420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.361460 0.539363 -13.648 2.67e-06 ***
## mean 0.000882 0.001080 0.817 0.441
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2764 on 7 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.08704, Adjusted R-squared: -0.04339
## F-statistic: 0.6673 on 1 and 7 DF, p-value: 0.4409
######################################################### INVERSE SIMPSON #########################################################
invsimps_fraction_wilcox <- wilcox.test(mean ~ fraction, data = invsimps)
invsimps_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: mean by fraction
## W = 81, p-value = 0.6297
## alternative hypothesis: true location shift is not equal to 0
filter(invsimps) %>%
group_by(fraction) %>%
summarize(mean(mean), sd(mean))
## # A tibble: 2 x 3
## fraction `mean(mean)` `sd(mean)`
## <fct> <dbl> <dbl>
## 1 Particle 35.5 23.8
## 2 Free 24.1 8.14
# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree
invsimps_distribution_plot <-
ggplot(invsimps, aes(y = mean, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
ylab("Inverse Simpson") + xlab("Fraction") +
geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
theme(legend.position = "none", #axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Inverse Simpson vs Community-wide (Per-Liter) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_perliter_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Community-wide (Per-Liter) Production
prod_vs_invsimps_plot <-
ggplot(invsimps, aes(x=mean, y=frac_bacprod)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) + # Y-axis errorbars
geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
ylab("Community Production \n (μgC/L/day)") +
xlab("Inverse Simpson") +
geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
theme(legend.position = "none", axis.title.x = element_blank(), axis.text.x = element_blank())
# Summary stats
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5589 -2.1093 -0.1969 0.8752 7.8282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.45199 2.43866 -0.185 0.85666
## mean 0.29344 0.05781 5.076 0.00048 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.571 on 10 degrees of freedom
## Multiple R-squared: 0.7204, Adjusted R-squared: 0.6925
## F-statistic: 25.77 on 1 and 10 DF, p-value: 0.0004804
# WITHOUT THE TWO HIGH POINTS
summary(lm(frac_bacprod ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70)))
##
## Call:
## lm(formula = frac_bacprod ~ mean, data = filter(invsimps, fraction ==
## "Particle" & mean < 70))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3309 -1.6323 -0.3605 1.4988 6.5349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.61837 2.49960 0.647 0.5355
## mean 0.20554 0.08156 2.520 0.0358 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.688 on 8 degrees of freedom
## Multiple R-squared: 0.4425, Adjusted R-squared: 0.3729
## F-statistic: 6.351 on 1 and 8 DF, p-value: 0.03581
################ Inverse Simpson vs Per Capitra (Per-Cell) Production
## 1. Extract the R2 and p-value from the linear model:
lm_lab_percell_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_percell_prod_vs_invsimps_PA)$coefficients[,4][2]), digits = 4), ")")
## 2. Plot Inverse Simpson vs Per Capitra (Per-Cell) Production
percell_prod_vs_invsimps_plot <-
ggplot(invsimps, aes(x=mean, y=log10(fracprod_per_cell_noinf), fill = fraction, color = fraction)) +
geom_errorbarh(aes(xmin = mean - sd, xmax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_point(size = 3.5, color = "black", aes(shape = season)) +
scale_color_manual(values = fraction_colors, guide = TRUE) +
scale_fill_manual(values = fraction_colors, guide = FALSE) +
scale_shape_manual(values = season_shapes) +
ylab("log10(production/cell)\n (μgC/cell/day)") +
xlab("Inverse Simpson") +
geom_smooth(data=filter(invsimps, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") +
scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) +
#scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) +
# Add the R2 and p-value to the plot
annotate("text", x=70, y=-7.5, label=lm_lab_percell_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
guides(color = guide_legend(override.aes = list(shape = 15))) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
# Summary stats
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28651 -0.18384 -0.11125 0.07337 0.56444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.360961 0.159490 -46.153 5.27e-12 ***
## mean 0.018087 0.003759 4.812 0.000958 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2969 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.7201, Adjusted R-squared: 0.689
## F-statistic: 23.15 on 1 and 9 DF, p-value: 0.0009581
# WITHOUT THE TWO HIGH POINTS
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70)))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps,
## fraction == "Particle" & mean < 70))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23986 -0.19120 -0.03481 0.06995 0.49119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.174160 0.165595 -43.324 9.11e-10 ***
## mean 0.009591 0.005612 1.709 0.131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.243 on 7 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2944, Adjusted R-squared: 0.1936
## F-statistic: 2.921 on 1 and 7 DF, p-value: 0.1312
summary(lm(log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps, fraction == "Particle" & mean < 70 & log10(fracprod_per_cell_noinf) < 1e-7)))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mean, data = filter(invsimps,
## fraction == "Particle" & mean < 70 & log10(fracprod_per_cell_noinf) <
## 1e-07))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23986 -0.19120 -0.03481 0.06995 0.49119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.174160 0.165595 -43.324 9.11e-10 ***
## mean 0.009591 0.005612 1.709 0.131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.243 on 7 degrees of freedom
## Multiple R-squared: 0.2944, Adjusted R-squared: 0.1936
## F-statistic: 2.921 on 1 and 7 DF, p-value: 0.1312
# Plot Inverse simpson plots altogether
invsimps_plots <- plot_grid(invsimps_distribution_plot, prod_vs_invsimps_plot, percell_prod_vs_invsimps_plot,
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1.1),
align = "v")
# All 3 richness plots together
rich_plots <- plot_grid(rich_distribution_plot, prod_vs_rich_plot,
percell_prod_vs_rich_plot + theme(legend.position = "none"),
labels = c("A", "C", "E"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1),
align = "v")
invsimps_plots_noyaxis <-
plot_grid(invsimps_distribution_plot + theme(axis.title.y = element_blank(), axis.text.y = element_blank()),
prod_vs_invsimps_plot + theme(axis.title.y = element_blank()),
percell_prod_vs_invsimps_plot + theme(axis.title.y = element_blank(), legend.position = "none"),
labels = c("B", "D", "F"), ncol = 1, nrow = 3,
rel_heights = c(0.5, 0.8, 1),
align = "v")
# Extract distribution plots of richness and inverse simpson
figure2_row1 <- plot_grid(rich_plots, invsimps_plots_noyaxis,
ncol = 2, nrow = 1, rel_widths = c(1, 0.75),
align = "h")
######## PLOT FIGURE 2
plot_grid(figure2_row1, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Figure 3
Unweighted ses.mpd
######################################################### SES MPD DISTRIBUTION: UNWEIGHTED
################ Figure 3A: Distribution of Particle and Free Unweighted Mean Pairwise distance
## 1. Is there a difference between particle and free Unweighted Mean Pairwise distance?
unweighted_fraction_wilcox <- wilcox.test(mpd.obs.z ~ fraction, data = unweighted_df)
unweighted_fraction_wilcox
##
## Wilcoxon rank sum test
##
## data: mpd.obs.z by fraction
## W = 30, p-value = 0.01449
## alternative hypothesis: true location shift is not equal to 0
# 2. What are the means of particle and free unweighted PD?
unweighted_df %>%
group_by(fraction) %>%
summarize(mean(mpd.obs.z))
## # A tibble: 2 x 2
## fraction `mean(mpd.obs.z)`
## <fct> <dbl>
## 1 Particle -0.497
## 2 Free 0.438
## 3. Plot Unweighted Phylogenetic Diversity vs Richness
# Make a data frame to draw significance line in boxplot (visually calculated)
dat4 <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1.6,1.7,1.7,1.6)) # WholePart vs WholeFree
unweight_distribution_plot <-
ggplot(unweighted_df, aes(y = mpd.obs.z, x = fraction)) +
scale_fill_manual(values = fraction_colors) +
geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) +
geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
scale_shape_manual(values = season_shapes) +
scale_y_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
ylab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
xlab("Fraction") +
geom_hline(yintercept = 0, linetype = "dashed", size = 1.5) +
##### Particle vs free per-cell production
geom_path(data = dat4, aes(x = a, y = b), linetype = 1, color = "#424645") +
annotate("text", x=1.5, y=1.85, size = 8, color = "#424645", label= paste("*"), angle = 90) +
annotate("text", x=1.5, y=1.35, size = 4, color = "#424645",
label= paste("p =", round(unweighted_fraction_wilcox$p.value, digits = 2))) +
theme(legend.position = "none",# axis.title.y = element_blank(),
axis.title.x = element_blank(), axis.text.x = element_blank()) +
coord_flip()
################ Figure 3B: Unweighted Phylogenetic Diversity vs Richness
## 1. Is there a relationship between richness and Unweighted Mean Pairwise distance?
lm_richness_vs_unweight <- lm(mean ~ mpd.obs.z, data = unweighted_df)
# Linear model results for particle-associated only
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
##
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction ==
## "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -191.79 -112.16 -41.89 55.90 278.87
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 515.15 51.16 10.069 1.49e-06 ***
## mpd.obs.z -83.78 49.91 -1.679 0.124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 155 on 10 degrees of freedom
## Multiple R-squared: 0.2199, Adjusted R-squared: 0.1419
## F-statistic: 2.818 on 1 and 10 DF, p-value: 0.1241
# Linear model results for free-living only
summary(lm(mean ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
##
## Call:
## lm(formula = mean ~ mpd.obs.z, data = filter(unweighted_df, fraction ==
## "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -100.15 -66.22 -18.70 43.65 172.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 337.088 42.753 7.884 1.34e-05 ***
## mpd.obs.z 3.053 77.510 0.039 0.969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.18 on 10 degrees of freedom
## Multiple R-squared: 0.0001551, Adjusted R-squared: -0.09983
## F-statistic: 0.001551 on 1 and 10 DF, p-value: 0.9694
## 2. Extract the R2 and p-value from the linear model:
lm_lab_richness_vs_unweight<- paste("atop(R^2 ==", round(summary(lm_richness_vs_unweight)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(lm_richness_vs_unweight)$coefficients[,4][2]), digits = 3), ")")
## 3. Plot Unweighted Phylogenetic Diversity vs Richness
unweight_rich_vs_mpd_plot <-
ggplot(unweighted_df, aes(y = mean, x = mpd.obs.z)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(fill = fraction, shape = season)) + ylab("\n Observed Richness") +
scale_shape_manual(values = season_shapes) +
xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
scale_fill_manual(values = fraction_colors) +
geom_smooth(method = "lm", color = "#424645", fill = "#424645", alpha = 0.3) +
scale_x_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
# Add the R2 and p-value to the plot
annotate("text", x=0.75, y=750, label=lm_lab_richness_vs_unweight, parse = TRUE, color = "#424645", size = 4) +
theme(legend.title = element_blank(), legend.position = "none",
axis.text.x = element_blank(), axis.title.x = element_blank())
################ Figure 3C: Unweighted Mean Pairwise distance vs Community-Wide Production
# Is there a relationship between Production and Unweighted Mean Pairwise distance?
summary(lm(frac_bacprod ~ mpd.obs.z, data = unweighted_df)) # NS
##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = unweighted_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.416 -9.547 -3.120 8.193 44.030
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.125 3.102 5.520 1.51e-05 ***
## mpd.obs.z 3.901 3.768 1.035 0.312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.19 on 22 degrees of freedom
## Multiple R-squared: 0.04645, Adjusted R-squared: 0.003106
## F-statistic: 1.072 on 1 and 22 DF, p-value: 0.3118
# Linear model results for particle-associated only
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.145 -4.312 -1.824 3.402 19.527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.213 2.617 3.138 0.0105 *
## mpd.obs.z -3.511 2.553 -1.376 0.1990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.928 on 10 degrees of freedom
## Multiple R-squared: 0.1591, Adjusted R-squared: 0.07501
## F-statistic: 1.892 on 1 and 10 DF, p-value: 0.199
# Linear model results for free-living only
summary(lm(frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
##
## Call:
## lm(formula = frac_bacprod ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.996 -14.939 2.192 8.751 37.004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.191 8.398 2.166 0.0555 .
## mpd.obs.z 13.410 15.225 0.881 0.3991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.71 on 10 degrees of freedom
## Multiple R-squared: 0.072, Adjusted R-squared: -0.0208
## F-statistic: 0.7758 on 1 and 10 DF, p-value: 0.3991
unweight_prod_vs_mpd_plot <-
ggplot(unweighted_df, aes(y = frac_bacprod, x = mpd.obs.z, fill = fraction)) +
geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction), alpha = 0.7) + # X-axis errorbars
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(shape = season)) +
scale_shape_manual(values = season_shapes) +
ylab("\n Community Production \n (μgC/L/day)") +
xlab("Standardized Effect Size \n Unweighted Mean Pairwise Dist") +
scale_color_manual(values = fraction_colors) +
scale_fill_manual(values = fraction_colors) +
scale_x_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
theme(legend.title = element_blank(), legend.position = "none",
axis.text.x = element_blank(), axis.title.x = element_blank())
################ Figure 3D: Unweighted Phylogenetic Diversity vs Per Capita (Per-Cell) Production
## 1. Run the linear model: Is there a relationship between PER-CELL PRODUCTION and Unweighted Mean Pairwise distance?
unweight_vs_percell <- lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
summary(unweight_vs_percell)
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = unweighted_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.72265 -0.28974 0.00388 0.12956 1.31921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.19878 0.09989 -72.065 < 2e-16 ***
## mpd.obs.z -0.49728 0.12046 -4.128 0.000478 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4778 on 21 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.448, Adjusted R-squared: 0.4217
## F-statistic: 17.04 on 1 and 21 DF, p-value: 0.0004784
summary(aov(log10(fracprod_per_cell_noinf) ~ mpd.obs.z +fraction, data = unweighted_df))
## Df Sum Sq Mean Sq F value Pr(>F)
## mpd.obs.z 1 3.890 3.890 20.749 0.000192 ***
## fraction 1 1.044 1.044 5.569 0.028556 *
## Residuals 20 3.750 0.188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
summary(aov(log10(fracprod_per_cell_noinf) ~ fraction + mpd.obs.z, data = unweighted_df))
## Df Sum Sq Mean Sq F value Pr(>F)
## fraction 1 4.139 4.139 22.075 0.000138 ***
## mpd.obs.z 1 0.796 0.796 4.243 0.052673 .
## Residuals 20 3.750 0.188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
lmer(log10(fracprod_per_cell_noinf) ~ mpd.obs.z + (fraction) +
(0+dummy(fraction, "Particle")), data = Orthodont)
## Error in lmer(log10(fracprod_per_cell_noinf) ~ mpd.obs.z + (fraction) + : could not find function "lmer"
library(lme4)
# Look for the marginal R2
null<-lmer(log10(fracprod_per_cell_noinf)~(1|fraction),data=unweighted_df)
full<-lmer(log10(fracprod_per_cell_noinf)~mpd.obs.z+(1| fraction),data=unweighted_df)
anova(null,full)
## Data: unweighted_df
## Models:
## null: log10(fracprod_per_cell_noinf) ~ (1 | fraction)
## full: log10(fracprod_per_cell_noinf) ~ mpd.obs.z + (1 | fraction)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## null 3 40.590 43.997 -17.295 34.590
## full 4 36.478 41.020 -14.239 28.478 6.1117 1 0.01343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(full)
## Analysis of Variance Table
## Df Sum Sq Mean Sq F value
## mpd.obs.z 1 1.092 1.092 5.8239
summary(lm(log10(fracprod_per_cell_noinf) ~ fraction, data = unweighted_df))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ fraction, data = unweighted_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64696 -0.35864 0.04763 0.17090 1.25132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.7258 0.1403 -47.946 < 2e-16 ***
## fractionFree -0.8492 0.1942 -4.373 0.000266 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4652 on 21 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4766, Adjusted R-squared: 0.4517
## F-statistic: 19.12 on 1 and 21 DF, p-value: 0.0002664
# Linear model results for particle-associated only
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Particle")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Particle"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3795 -0.2419 -0.1651 0.0443 1.1815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.9248 0.1711 -40.475 1.71e-11 ***
## mpd.obs.z -0.3299 0.1627 -2.028 0.0732 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4649 on 9 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.2373
## F-statistic: 4.112 on 1 and 9 DF, p-value: 0.07319
# Linear model results for free-living only
summary(lm(log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df, fraction == "Free")))
##
## Call:
## lm(formula = log10(fracprod_per_cell_noinf) ~ mpd.obs.z, data = filter(unweighted_df,
## fraction == "Free"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63978 -0.29782 0.07694 0.17610 0.76503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.55622 0.19603 -38.545 3.3e-12 ***
## mpd.obs.z -0.04303 0.35540 -0.121 0.906
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4135 on 10 degrees of freedom
## Multiple R-squared: 0.001463, Adjusted R-squared: -0.09839
## F-statistic: 0.01466 on 1 and 10 DF, p-value: 0.906
## 2. Extract the R2 and p-value from the linear model:
lm_lab_percell_unweightPD <- paste("atop(R^2 ==", round(summary(unweight_vs_percell)$adj.r.squared, digits = 2), ",",
"p ==", round(unname(summary(unweight_vs_percell)$coefficients[,4][2]), digits = 4), ")")
## 3. Plot Unweighted Phylogenetic Diversity vs Per Capitra (Per-Cell) Production
unweight_percell_vs_mpd_plot <-
ggplot(unweighted_df,
aes(y = log10(fracprod_per_cell_noinf), x = mpd.obs.z)) +
geom_vline(xintercept = 0, linetype = "dashed", size = 1.5) +
geom_point(size = 3, aes(fill = fraction, shape = season)) +
ylab("\n log10(Per-Capita Production)\n (μgC/cell/day)") +
xlab("Unweighted Phylogenetic Diversity") +
scale_fill_manual(values = fraction_colors) +
scale_shape_manual(values = season_shapes) +
geom_smooth(method = "lm", color="#424645", fill = "#424645", alpha = 0.3) +
scale_y_continuous(limits = c(-8.5,-5), breaks = c(-8, -7, -6)) +
scale_x_continuous(limits = c(-1.5,1.85), breaks = seq(from = -1.5, to = 1.5, by = 0.5)) +
# Add the R2 and p-value to the plot
annotate("text", x=0.75, y=-6, label=lm_lab_percell_unweightPD, parse = TRUE, color = "#424645", size = 4) +
theme(legend.title = element_blank(), legend.position ="bottom",
legend.text = element_text(size = 14))
######## FIGURE 3
figure3_row1 <- plot_grid(unweight_distribution_plot, unweight_rich_vs_mpd_plot, unweight_prod_vs_mpd_plot,
unweight_percell_vs_mpd_plot + theme(legend.position = "none"),
labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4,
rel_heights = c(0.5, 1, 1, 1.2),
align = "v")
plot_grid(figure3_row1, season_legend,
ncol = 1, nrow = 2,
rel_heights = c(1, 0.05))
Lasso Regressions
Prepare the data for Lasso Regressions
unweight_vals <- unweighted_df %>%
dplyr::select(mpd.obs.z, norep_filter_name) %>%
rename(Unweighted_PD = mpd.obs.z)
weighted_vals <- weighted_df %>%
dplyr::select(mpd.obs.z, norep_filter_name) %>%
rename(Weighted_PD = mpd.obs.z)
wide_all_divs <- otu_alphadiv %>%
dplyr::select(norep_filter_name ,mean, measure) %>%
spread(measure, mean)
lasso_data_df <- metadata_pca %>%
left_join(wide_all_divs, by = "norep_filter_name") %>%
left_join(unweight_vals, by = "norep_filter_name") %>%
left_join(weighted_vals, by = "norep_filter_name") %>%
dplyr::select(-c(project, year, Date, limnion, norep_water_name, dnaconcrep1, perc_attached_bacprod,
SD_perc_attached_bacprod, SE_total_bac_abund, SE_perc_attached_abund, SE_attached_bac))
## Warning: Column `norep_filter_name` joining character vector and factor, coercing into character vector
### PARTICLE DATA
lasso_data_df_particle <- lasso_data_df %>%
filter(fraction == "Particle" ) %>%
dplyr::select(-fraction)
lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station))
percell_lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
### FREE DATA
lasso_data_df_free <- lasso_data_df %>%
filter(fraction == "Free" ) %>%
dplyr::select(-fraction)
lasso_data_df_free_noprod <- lasso_data_df_free %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station))
percell_lasso_data_df_free_noprod <- lasso_data_df_free %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
## ALL DATA
all_dat_lasso_comm <- lasso_data_df %>%
dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod,
norep_filter_name, BGA_cellspermL, Turb_NTU,
lakesite, season, station, fraction))
all_dat_lasso_percapita <- lasso_data_df %>%
dplyr::select(-c(fracprod_per_cell, frac_bacprod,
tot_bacprod, SD_tot_bacprod,SD_frac_bacprod, fraction,
lakesite, season, norep_filter_name, station, BGA_cellspermL, Turb_NTU)) %>%
dplyr::filter(Temp_C != 14.28) # Remove the missing row of data :( )
Perform Lasso regression
Bulk Community Production: Particle
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data <-
lasso_data_df_particle_noprod %>% # Use data only for the particle samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(frac_bacprod ~ ., scaled_comm_data)[,-1]
y = scaled_comm_data$frac_bacprod
grid = 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min # Minimum lambda
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] TRUE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 1.375897
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.478053e-17
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 1.300983e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset with the best lambda value
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.478053e-17
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 1.300983e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
The lasso model uses Inverse Simpson as the best and only predictor of production!
Community-Wide Production: Free
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data_free <-
lasso_data_df_free_noprod %>% # Use data only for the free-living samples
scale() %>% # Scale all of the variables to have mean = 0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_free)[,-1]
y <- scaled_comm_data_free$frac_bacprod
grid <- 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train_free_comm <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train_free_comm)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train_free_comm, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test
## [1] 1.659002
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.028226e-15
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -5.122124e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL 2.183338e-02
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 2.171826e-01
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD 1.397650e-01
## Weighted_PD .
## Run lasso on the entire dataset with the best lambda value
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.292254e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -2.218982e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 2.514746e-02
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
PC5 and pH are the two best predictors of community-wide heterotrophic bacterial production in the free-living habitat.
Per-capita Production: Particle
scaled_percapita_data <- percell_lasso_data_df_particle_noprod %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf)) %>%
scale() %>%
as.data.frame()
set.seed(777)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data)[,-1]
y = scaled_percapita_data$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 1.743068
## Run lasso on the entire dataset with the best lasso
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.322846e-16
## Sample_depth_m .
## Temp_C -1.371144e-01
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund -5.741093e-03
## attached_bac -3.957476e-02
## perc_attached_abund .
## fraction_bac_abund -1.164412e-16
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 1.240751e-02
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 4.388114e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset with the best lasso
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.173111e-16
## Sample_depth_m .
## Temp_C -8.717767e-02
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson 3.530134e-01
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
Temperature and Inverse Simpson’s Index are the two best predictors of particle-associated per-capita production.
Per-capita Production: Free
scaled_percapita_data_free <- percell_lasso_data_df_free_noprod %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf)) %>%
scale() %>%
as.data.frame()
set.seed(777)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_data_free)[,-1]
y = scaled_percapita_data_free$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] TRUE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 1.159787
## Run lasso on the entire dataset
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.365298e-15
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -1.017130e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1.365298e-15
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -1.017130e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
pH is the only predictors of free-living per-capita production.
Community-wide Production: All Samples
# Set the seed for reproducibility of the grid values
set.seed(777)
################ PREPARE DATA ################
# Subset data needed and scale all o
scaled_comm_data_ALL <-
all_dat_lasso_comm %>% # Use community-wide production data only for the all samples
scale() %>% # Scale all of the variables to have mean =0 and sd = 1
as.data.frame() # Make it a dataframe so that model.matrix function works (does not take a matrix)
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_ALL)[,-1]
y <- scaled_comm_data_ALL$frac_bacprod
grid <- 10^seq(10,-2,length = 100)
################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
y_test <- y[test]
################ LASSO ################
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
best_lasso_lambda
## [1] 0.3946859
lasso_lambda_1se
## [1] 0.7569721
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## [1] 0.5662862
## Run lasso on the entire dataset with the best lambda value
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.328268e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -2.134918e-01
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund 4.841677e-02
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
## Run lasso on the entire dataset with the best lambda value
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 34 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.081668e-17
## Sample_depth_m 0.000000e+00
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## attached_bac .
## perc_attached_abund .
## fraction_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness .
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
As we can see in Figure 2 and Table S1, there is much less variation explained in this model by a linear regression. Here, the result of the “one-standard-error” approach shows that the lasso is unable to pick a variable. Therefore, no variables are chosen.
Per-capita Production: All Samples
set.seed(777)
scaled_percapita_ALL <- all_dat_lasso_percapita %>%
dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
dplyr::select(-c(fracprod_per_cell_noinf, perc_attached_abund, fraction_bac_abund, attached_bac)) %>%
scale() %>%
as.data.frame()
# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x = model.matrix(log10_percell ~ ., scaled_percapita_ALL)[,-1]
y = scaled_percapita_ALL$log10_percell
grid = 10^seq(10,-2,length = 100)
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model
train <- sample(1:nrow(x), nrow(x)/2)
test <- -train
################ LASSO
# Run lasso regression with alpha = 1
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)
# Cross validation
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(cv_lasso_divs)
best_lasso_lambda <- cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se
## [1] FALSE
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2)
## Warning in lasso_divs_pred - y_test: longer object length is not a multiple of shorter object length
## Error in mean((lasso_divs_pred - y_test)^2): dims [product 11] do not match the length of object [12]
## Run lasso on the entire dataset
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.520111e-16
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH -6.755466e-02
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness 4.154369e-01
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD -6.310163e-02
## Weighted_PD .
## Run lasso on the entire dataset
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients?
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.854064e-18
## Sample_depth_m .
## Temp_C .
## SpCond_uSpercm .
## TDS_mgperL .
## pH .
## ORP_mV .
## Chl_Lab_ugperL .
## Cl_mgperL .
## SO4_mgperL .
## NO3_mgperL .
## NH3_mgperL .
## TKN_mgperL .
## SRP_ugperL .
## TP_ugperL .
## Alk_mgperL .
## DO_mgperL .
## DO_percent .
## total_bac_abund .
## PC1 .
## PC2 .
## PC3 .
## PC4 .
## PC5 .
## PC6 .
## Richness 9.842520e-02
## Shannon_Entropy .
## Inverse_Simpson .
## Simpsons_Evenness .
## Unweighted_PD .
## Weighted_PD .
As we can see in Figure S7 and Table S2, Richness is the best predictor of per-capita heterotrophic production when all samples are used in the model.